options(warn=-1)
library(pacman)
p_load(dplyr)
source('../lib/import.R')
#
# Es una librería de funciones comunes desarrolladas a partir de este TP.
import('../lib/common-lib.R')## [1] "-> './reflection.R' script loadded successfuly!"
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> './csv.R' script loadded successfuly!"
## Installing package into '/usr/local/lib/R/site-library'
## (as 'lib' is unspecified)
##
## tidyverse installed
## Installing package into '/usr/local/lib/R/site-library'
## (as 'lib' is unspecified)
##
## tidyverse installed
## Installing package into '/usr/local/lib/R/site-library'
## (as 'lib' is unspecified)
##
## tidyverse installed
## [1] "-> './plot.R' script loadded successfuly!"
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> './importance.R' script loadded successfuly!"
## [1] "-> './correlation.R' script loadded successfuly!"
## [1] "-> './pca.R' script loadded successfuly!"
## [1] "-> './set_split.R' script loadded successfuly!"
## [1] "-> './models.R' script loadded successfuly!"
## ---
## biotools version 4.1
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> './test.R' script loadded successfuly!"
## [1] "-> './metrics.R' script loadded successfuly!"
## [1] "-> './balance.R' script loadded successfuly!"
## [1] "-> '../lib/common-lib.R' script loadded successfuly!"
#
# Funciones especificas para este TP.
import('../scripts/helper_functions.R')## [1] "-> '../lib/pca.R' script loadded successfuly!"
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> '../lib/importance.R' script loadded successfuly!"
## [1] "-> '../scripts/helper_functions.R' script loadded successfuly!"
set.seed(1)1.1. Excluimos las columnas no numéricas que no nos interesan para este análisis.
excluded_columns <- c(
'Neo.Reference.ID',
'Name',
'Close.Approach.Date',
'Epoch.Date.Close.Approach',
'Orbit.ID',
'Orbit.Determination.Date',
'Orbiting.Body',
'Est.Dia.in.Feet.min.',
'Est.Dia.in.Feet.max.',
'Est.Dia.in.M.min.',
'Est.Dia.in.M.max.',
'Est.Dia.in.KM.min.',
'Est.Dia.in.KM.max.',
'Equinox',
'Orbit.Uncertainity'
)excluirnos observaciones con valores faltaste.
ds_step_1 <- loadcsv('../datasets/nasa.csv') %>%
dplyr::select(-excluded_columns) %>%
na.omit## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(excluded_columns)` instead of `excluded_columns` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
str(ds_step_1)## 'data.frame': 4687 obs. of 25 variables:
## $ Absolute.Magnitude : num 21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
## $ Est.Dia.in.Miles.min. : num 0.07905 0.09076 0.14385 0.00547 0.07905 ...
## $ Est.Dia.in.Miles.max. : num 0.1768 0.203 0.3217 0.0122 0.1768 ...
## $ Relative.Velocity.km.per.sec: num 6.12 18.11 7.59 11.17 9.84 ...
## $ Relative.Velocity.km.per.hr : num 22017 65210 27327 40226 35427 ...
## $ Miles.per.hour : num 13681 40519 16980 24995 22013 ...
## $ Miss.Dist..Astronomical. : num 0.419 0.383 0.051 0.285 0.408 ...
## $ Miss.Dist..lunar. : num 163.2 149 19.8 111 158.6 ...
## $ Miss.Dist..kilometers. : num 62753692 57298148 7622912 42683616 61010824 ...
## $ Miss.Dist..miles. : num 38993336 35603420 4736658 26522368 37910368 ...
## $ Minimum.Orbit.Intersection : num 0.02528 0.18693 0.04306 0.00551 0.0348 ...
## $ Jupiter.Tisserand.Invariant : num 4.63 5.46 4.56 5.09 5.15 ...
## $ Epoch.Osculation : num 2458000 2458000 2458000 2458000 2458000 ...
## $ Eccentricity : num 0.426 0.352 0.348 0.217 0.21 ...
## $ Semi.Major.Axis : num 1.41 1.11 1.46 1.26 1.23 ...
## $ Inclination : num 6.03 28.41 4.24 7.91 16.79 ...
## $ Asc.Node.Longitude : num 314.4 136.7 259.5 57.2 84.6 ...
## $ Orbital.Period : num 610 426 644 514 496 ...
## $ Perihelion.Distance : num 0.808 0.718 0.951 0.984 0.968 ...
## $ Perihelion.Arg : num 57.3 313.1 248.4 18.7 158.3 ...
## $ Aphelion.Dist : num 2.01 1.5 1.97 1.53 1.48 ...
## $ Perihelion.Time : num 2458162 2457795 2458120 2457902 2457814 ...
## $ Mean.Anomaly : num 264.8 173.7 292.9 68.7 135.1 ...
## $ Mean.Motion : num 0.591 0.845 0.559 0.7 0.726 ...
## $ Hazardous : chr "True" "False" "True" "False" ...
Excluimos las columnas que tiene un correlación mayor al 80%.
high_correlated_columns <- find_high_correlated_columns(
feat(ds_step_1),
cutoff=0.8
)
length(high_correlated_columns)## [1] 11
length(ds_step_1)## [1] 25
ds_step_2 <- ds_step_1 %>% dplyr::select(-high_correlated_columns[-1])
rm(ds_step_1)
length(ds_step_2)## [1] 15
Para realizar este paso vamos a utilizar la función feature_importance del algoritmo Random Forest. Esta función nos permite comparar las variables descuerdo a cuan buenas son para separa las clases. Luego tomaremos las 5 variables que mejor separa la clases.
result <- features_importance(ds_step_2, target = 'Hazardous')## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(target)` instead of `target` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
result##
## Call:
## randomForest(x = features, y = target, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 0.34%
## Confusion matrix:
## False True class.error
## False 3925 7 0.001780264
## True 9 746 0.011920530
plot_features_importance(result)n_best_features = 5
# n_best_features = 15
best_features <- top_acc_features(result, top=n_best_features)## Selecting by MeanDecreaseGini
best_features## [1] "Minimum.Orbit.Intersection" "Absolute.Magnitude"
## [3] "Est.Dia.in.Miles.min." "Perihelion.Distance"
## [5] "Inclination"
length(best_features)## [1] 5
ds_step_3 <- ds_step_2 %>% dplyr::select(c(best_features, c(Hazardous)))## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(best_features)` instead of `best_features` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
# rm(ds_step_2)str(ds_step_3)## 'data.frame': 4687 obs. of 6 variables:
## $ Minimum.Orbit.Intersection: num 0.02528 0.18693 0.04306 0.00551 0.0348 ...
## $ Absolute.Magnitude : num 21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
## $ Est.Dia.in.Miles.min. : num 0.07905 0.09076 0.14385 0.00547 0.07905 ...
## $ Perihelion.Distance : num 0.808 0.718 0.951 0.984 0.968 ...
## $ Inclination : num 6.03 28.41 4.24 7.91 16.79 ...
## $ Hazardous : chr "True" "False" "True" "False" ...
Ahora comparemos el resultado de feature_importance con PCA.
plot_pca_original_variables(feat(ds_step_3))## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 4687 individuals, described by 5 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
ds_step_4 <- ds_step_3 %>%
mutate(Hazardous = case_when(Hazardous %in% c('True') ~ 1, TRUE ~ 0))
str(ds_step_4)## 'data.frame': 4687 obs. of 6 variables:
## $ Minimum.Orbit.Intersection: num 0.02528 0.18693 0.04306 0.00551 0.0348 ...
## $ Absolute.Magnitude : num 21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
## $ Est.Dia.in.Miles.min. : num 0.07905 0.09076 0.14385 0.00547 0.07905 ...
## $ Perihelion.Distance : num 0.808 0.718 0.951 0.984 0.968 ...
## $ Inclination : num 6.03 28.41 4.24 7.91 16.79 ...
## $ Hazardous : num 1 0 1 0 1 0 0 0 0 1 ...
coparative_boxplot(feat(ds_step_4), to_col=n_best_features)comparative_histplot(feat(ds_step_4), to_col=n_best_features)## [[1]]
## [1] 3.7e-24
##
## [[2]]
## [1] 3.7e-24
##
## [[3]]
## [1] 3.7e-24
##
## [[4]]
## [1] 3.7e-24
##
## [[5]]
## [1] 3.7e-24
Observaciones: Al parecer una sola variable parece ser normal.
uni_shapiro_test(feat(ds_step_4))## [1] "=> Variable: Minimum.Orbit.Intersection"
## [1] ""
##
## Shapiro-Wilk normality test
##
## data: features[, col_index]
## W = 0.82145, p-value < 2.2e-16
##
## [1] "=> Variable: Absolute.Magnitude"
## [1] ""
##
## Shapiro-Wilk normality test
##
## data: features[, col_index]
## W = 0.9888, p-value < 2.2e-16
##
## [1] "=> Variable: Est.Dia.in.Miles.min."
## [1] ""
##
## Shapiro-Wilk normality test
##
## data: features[, col_index]
## W = 0.41945, p-value < 2.2e-16
##
## [1] "=> Variable: Perihelion.Distance"
## [1] ""
##
## Shapiro-Wilk normality test
##
## data: features[, col_index]
## W = 0.98627, p-value < 2.2e-16
##
## [1] "=> Variable: Inclination"
## [1] ""
##
## Shapiro-Wilk normality test
##
## data: features[, col_index]
## W = 0.88555, p-value < 2.2e-16
Observaciones: En todos los casos el p-valor < 0.05 y se rechaza normalidad en todas las variables. Esto coincido con los qqplot donde en todos los casos no son normales salvo en una nunca variable donde parece tender anormalidad.
mult_shapiro_test(feat(ds_step_4))##
## Shapiro-Wilk normality test
##
## data: Z
## W = 0.29729, p-value < 2.2e-16
Observaciones: El p-valore < 0.05 por lo tanto se rechaza normalidad multi-variada. Se corresponde con el resultado del test de shapiro uni-variado pero no con el qqplot de cada variable. Entiendo que todos las variables se acercan a una normal, pero no o son completamente.
uni_levene_test(feat(ds_step_4), ds_step_4$Hazardous)## [1] "=> Variable: Minimum.Orbit.Intersection"
## [1] ""
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 576.34 < 2.2e-16 ***
## 4685
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "=> Variable: Absolute.Magnitude"
## [1] ""
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 500.87 < 2.2e-16 ***
## 4685
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "=> Variable: Est.Dia.in.Miles.min."
## [1] ""
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 0.3586 0.5493
## 4685
## [1] ""
## [1] ""
## [1] "=> Variable: Perihelion.Distance"
## [1] ""
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 16.241 5.667e-05 ***
## 4685
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "=> Variable: Inclination"
## [1] ""
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 3.099 0.0784 .
## 4685
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
Observaciones
multi_boxm_test(feat(ds_step_4), ds_step_4$Hazardous)##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: features
## Chi-Sq (approx.) = 3154.8, df = 15, p-value < 2.2e-16
Observaciones: El p-valor < 0.05 por lo tanto se rechaza la hipótesis nula y podemos decir que las variables tiene no son homocedasticas.
plot_correlations(feat(ds_step_4))## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 4687 individuals, described by 6 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 4687 individuals, described by 5 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
outliers para poder ver con mas claridad el biplot.
ds_without_outliers <- filter_outliers(ds_step_4, max_score=0.52)plot_robust_pca(ds_without_outliers)c(raw_train_set, raw_test_set) %<-% train_test_split(ds_step_4, train_size=.8)## [1] "Train Set size: 3749"
## [1] "Test set size: 938"
# balanced_train_set <- smote_balance(raw_train_set, raw_train_set$Hazardous)
#rm(raw_train_set)
balanced_train_set <- raw_train_setdesvío).
train_set <- balanced_train_set %>%
mutate_at(vars(-Hazardous), ~(scale(.) %>% as.vector))
test_set <- raw_test_set %>%
mutate_at(vars(-Hazardous), ~(scale(.) %>% as.vector))
rm(balanced_train_set)
rm(raw_test_set)reg_formula <- formula(Hazardous~.)
lda_model <- lda(reg_formula, train_set)
lda_test_pred <- predict(lda_model, test_set)
plot_cm(lda_test_pred$class, test_set$Hazardous)graphics.off()
plot_roc(lda_test_pred$class, test_set$Hazardous)##
## Method used: empirical
## ===== Positive(s) =====
## Number of positive(s): 154
## Mean of positive(s): 1.656
## Variance of positive(s): 0.2272
## ===== Negative(s) =====
## Number of negative(s): 784
## Mean of negative(s): 1.056
## Variance of negative(s): 0.05304
## ===== AUC =====
## Area under curve: 0.7999
##
## =========================
## FPR TPR
## 0.00000 0.0000
## 0.05612 0.6558
## 1.00000 1.0000
f_beta_score(lda_test_pred$class, test_set$Hazardous, beta=2)## [1] "F2Score: 0.287720926667887"
lda_model$scaling## LD1
## Minimum.Orbit.Intersection -1.25718073
## Absolute.Magnitude -1.59924349
## Est.Dia.in.Miles.min. -0.41638970
## Perihelion.Distance 0.06717141
## Inclination -0.01172453
qda_model <- qda(reg_formula, train_set)
qda_test_pred <- predict(qda_model, test_set)
plot_cm(qda_test_pred$class, test_set$Hazardous)plot_roc(qda_test_pred$class, test_set$Hazardous)##
## Method used: empirical
## ===== Positive(s) =====
## Number of positive(s): 154
## Mean of positive(s): 1.903
## Variance of positive(s): 0.08849
## ===== Negative(s) =====
## Number of negative(s): 784
## Mean of negative(s): 1.005
## Variance of negative(s): 0.005082
## ===== AUC =====
## Area under curve: 0.9487
##
## =========================
## FPR TPR
## 0.000000 0.0000
## 0.005102 0.9026
## 1.000000 1.0000
f_beta_score(qda_test_pred$class, test_set$Hazardous, beta=2)## [1] "F2Score: 0.0907401672344379"
rda_model <- rda(reg_formula, train_set)
rda_test_pred <- predict(rda_model, test_set)
plot_cm(rda_test_pred$class, test_set$Hazardous)plot_roc(rda_test_pred$class, test_set$Hazardous)##
## Method used: empirical
## ===== Positive(s) =====
## Number of positive(s): 154
## Mean of positive(s): 1.903
## Variance of positive(s): 0.08849
## ===== Negative(s) =====
## Number of negative(s): 784
## Mean of negative(s): 1.005
## Variance of negative(s): 0.005082
## ===== AUC =====
## Area under curve: 0.9487
##
## =========================
## FPR TPR
## 0.000000 0.0000
## 0.005102 0.9026
## 1.000000 1.0000
f_beta_score(rda_test_pred$class, test_set$Hazardous, beta=2)## [1] "F2Score: 0.0907401672344379"
rl_model <- glm(reg_formula, train_set, family=binomial)
rl_test_pred <- predict(rl_model, test_set)
rl_threshold <- 0.4
rl_test_pred_threshold <- ifelse(rl_test_pred >= rl_threshold, 1, 0)
plot_cm(rl_test_pred_threshold, test_set$Hazardous)plot_roc(rl_test_pred_threshold, test_set$Hazardous)##
## Method used: empirical
## ===== Positive(s) =====
## Number of positive(s): 154
## Mean of positive(s): 0.7273
## Variance of positive(s): 0.1996
## ===== Negative(s) =====
## Number of negative(s): 784
## Mean of negative(s): 0.01913
## Variance of negative(s): 0.01879
## ===== AUC =====
## Area under curve: 0.8541
##
## =========================
## FPR TPR
## 0.00000 0.0000
## 0.01913 0.7273
## 1.00000 1.0000
f_beta_score(rl_test_pred_threshold, test_set$Hazardous, beta=2)## [1] "F2Score: 0.753701211305518"
svm_model <- svm(reg_formula, train_set, kernel="radial")
svm_test_pred <- predict(svm_model, test_set)
svm_threshold <- 0.5
svm_test_pred_threshold <- ifelse(svm_test_pred >= rl_threshold, 1, 0)
plot_text_cm(svm_test_pred_threshold, test_set$Hazardous)## Prediction
## Reality 0 1
## 0 762 22
## 1 22 132
plot_roc(svm_test_pred_threshold, test_set$Hazardous)##
## Method used: empirical
## ===== Positive(s) =====
## Number of positive(s): 154
## Mean of positive(s): 0.8571
## Variance of positive(s): 0.1232
## ===== Negative(s) =====
## Number of negative(s): 784
## Mean of negative(s): 0.02806
## Variance of negative(s): 0.02731
## ===== AUC =====
## Area under curve: 0.9145
##
## =========================
## FPR TPR
## 0.00000 0.0000
## 0.02806 0.8571
## 1.00000 1.0000
f_beta_score(svm_test_pred_threshold, test_set$Hazardous, beta=2)## [1] "F2Score: 0.857142857142857"
Típica con estimadores de la normal.
scaled_data_1 <- ds_step_4 %>%
dplyr::select(-Hazardous) %>%
mutate(~(scale(.) %>% as.vector))Escalamiento diferente de la tipica normal.
scaled_data_2 <- apply(
ds_step_4 %>% dplyr::select(-Hazardous),
2,
function(x) { (x - min(x)) / (max(x) - min(x))}
)clustering_metrics_plot(scaled_data_1)clustering_metrics_plot(scaled_data_2)n_clusters <- 2
kmeans_model <- kmeans(scaled_data_1, n_clusters)
km_ds_step_4 <- data.frame(ds_step_4)
km_ds_step_4$kmeans <- kmeans_model$clusterds_without_outliers <- filter_outliers(km_ds_step_4, max_score=0.5)plot_robust_pca(
ds_without_outliers %>% dplyr::select(-kmeans),
groups = factor(ds_without_outliers$kmeans),
)Matriz de distancias euclídeas
mat_dist <- dist(x = ds_step_4, method = "euclidean") Dendrogramas (según el tipo de segmentación jerárquica aplicada)
hc_complete <- hclust(d = mat_dist, method = "complete")
hc_average <- hclust(d = mat_dist, method = "average")
hc_single <- hclust(d = mat_dist, method = "single")
hc_ward <- hclust(d = mat_dist, method = "ward.D2")Calculo del coeficiente de correlación cofenetico
cor(x = mat_dist, cophenetic(hc_complete))## [1] 0.8024213
cor(x = mat_dist, cophenetic(hc_average))## [1] 0.8064622
cor(x = mat_dist, cophenetic(hc_single))## [1] 0.5078482
cor(x = mat_dist, cophenetic(hc_ward))## [1] 0.5889577
Construcción de un dendograma usando los resultados de la técnica de Ward
n_clusters <- 6
graphics.off()
plot(hc_ward )
rect.hclust(hc_ward, k=n_clusters, border="red")hc_ds <- data.frame(ds_step_4)
hc_ds$her_ward <- cutree(hc_ward, k=n_clusters)ds_without_outliers <- filter_outliers(hc_ds, max_score=0.53)plot_robust_pca(
ds_without_outliers %>% dplyr::select(-her_ward),
groups = factor(ds_without_outliers$her_ward),
colours=c("orange","cyan","blue","magenta","yellow","black"),
labels=c("grupo 1", "grupo 2","grupo 3","grupo 4","grupo 5","grupo 6")
)Realizado por Adrian Marino
adrianmarino@gmail.com